# Warm-up cell: print one sample of each basic type (str, int, float, bool).
for sample in ("World Bank", 5, 5.5, True):
    print(sample)
Data taken from the World Bank open-data repository.
import pandas as pd
# World Bank indicator CSVs carry 4 metadata rows before the real header,
# hence skiprows=4.  Path is Colab-specific (/content).
main_data = pd.read_csv("/content/API_4_DS2_en_csv_v2_1741864.csv", skiprows= 4)
main_data.head()
import pandas as pd
# Per-country metadata (Region, IncomeGroup, ...) shipped alongside the indicator file.
country_data=pd.read_csv("/content/Metadata_Country_API_4_DS2_en_csv_v2_1741864.csv")
country_data.head()
main_data.columns
main_data['Indicator Name'].unique()
# Keep only the female-unemployment indicator (modelled ILO estimate).
main_data_unem=main_data[main_data['Indicator Name']=='Unemployment, female (% of female labor force) (modeled ILO estimate)']
main_data_unem.head()
# Generate the 30 year labels '1991'..'2020' instead of hard-coding a
# 30-element list — same columns, far less fragile to extend.
year_cols = [str(year) for year in range(1991, 2021)]
main_data_unem = main_data_unem[['Country Name', 'Country Code'] + year_cols]
main_data_unem.head()
country_data.columns
country_data.columns
# Restrict the metadata table to the join key plus the two grouping columns.
keep_cols = ['Country Code', 'Region', 'IncomeGroup']
country_data = country_data[keep_cols]
country_data.head()
# Inner join (pandas default) on the ISO country code.
merged_data = pd.merge(main_data_unem, country_data, on='Country Code')
merged_data.head()
merged_data.columns
# Reorder columns: identifiers first, the 30 year columns, then the group labels.
merged_data=merged_data[['Country Name', 'Country Code', '1991', '1992', '1993', '1994', '1995',
       '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004',
       '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
       '2014', '2015', '2016', '2017', '2018', '2019', '2020', 'Region',
       'IncomeGroup']]
merged_data.head()
# Missing-value inspection: True where a country/year estimate is absent.
merged_data.isna()
merged_data.isna().sum().head()
Check the shape of merged_data before cleaning:
# Shape check, then drop every row that has at least one missing year value.
merged_data.isna().shape
merged_data_clean = merged_data.dropna(axis=0, how='any')
merged_data_clean.head()
This is the new shape once the dataset has been cleaned:
merged_data_clean.shape
# numeric_only=True restricts the mean to the year columns.  Older pandas
# silently dropped the string columns (Country Name/Code, Region); pandas >= 2.0
# raises TypeError without the flag, so this is both backward- and
# forward-compatible.
grouped_data_income = merged_data_clean.groupby(['IncomeGroup']).mean(numeric_only=True)
grouped_data_income
grouped_data_income_describe = merged_data_clean.groupby(['IncomeGroup']).describe()
grouped_data_income_describe
# Transpose so years run down the index — convenient for time-series plotting.
transpose_income_group = grouped_data_income.transpose()
transpose_income_group
import matplotlib.pyplot as plt
# IPython magic: render figures inline in the notebook.
%matplotlib inline
# One line per income group; x axis is the year index from the transpose.
transpose_income_group.plot(figsize=(20,10))
plt.title("Mean % Female Unemployment")
plt.xlabel("Year")
plt.ylabel("% UnEmployment")
# The original computed groupby(...).describe() here and immediately overwrote
# it with .mean() — the dead statement is removed.  numeric_only=True keeps
# only the year columns (required on pandas >= 2.0, same result before).
grouped_data_region = merged_data_clean.groupby(['Region']).mean(numeric_only=True)
# Transpose so years run down the index, one line per region.
transposed_region = grouped_data_region.transpose()
transposed_region.plot(figsize=(20,10))
plt.title('Mean % Female Unemployment by Region')
plt.xlabel('Year')
plt.ylabel("% UnEmployment")
import scipy.stats as stats
merged_data_clean.head()
# Distinct income groups present in the cleaned data.
income_groups = merged_data_clean['IncomeGroup'].unique()
print(income_groups)
len(income_groups)
from IPython.display import display
# Widen the pandas display limits temporarily so the full frame renders.
with pd.option_context('display.max_rows', 299, 'display.max_columns', 40):
    display(merged_data_clean) #need display to show all data
# How many participant countries remain after cleaning?
CC_countries = merged_data_clean['Country Code'].unique()
print(CC_countries)
print(len(CC_countries))
# Country code together with its income group.
CC_IG = merged_data_clean[['Country Code','IncomeGroup']]
print(CC_IG)
print(len(CC_IG))
# One row per country code — sanity check that codes are unique.
CC_IG_GP= CC_IG.groupby(['Country Code']).describe()
print(CC_IG_GP)
print(len(CC_IG_GP))
# Count of countries per income group.
IG_IG_GP= CC_IG.groupby(['IncomeGroup']).describe()
print(IG_IG_GP)
print(len(IG_IG_GP))
# The 2019 slice that the ANOVA below is based on.
CC_IG_2019 = merged_data_clean[['Country Code','2019','IncomeGroup']]
print(CC_IG_2019)
print(len(CC_IG_2019))
import statistics
# Group means per income group (recomputed for this section).  numeric_only=True
# keeps only the year columns — required on pandas >= 2.0, identical result on
# older versions which silently dropped the string columns.
grouped_data_income = merged_data_clean.groupby(['IncomeGroup']).mean(numeric_only=True)
grouped_data_income
# (Removed four lines of commented-out exploratory code that was never used.)
income_groups
print(income_groups)
income_groups[0]
for income_group in income_groups:
    print(income_group)
# One pandas Series of 2019 values per income group.
income_group_data = [
    merged_data_clean['2019'][merged_data_clean['IncomeGroup'] == group]
    for group in income_groups
]
income_group_data
# Unpack the list so the one-way ANOVA works for any number of groups,
# not just the hard-coded four of the original.
statistic, pvalue = stats.f_oneway(*income_group_data)
print("statistic: %s pvalue %s" %(statistic,pvalue))
regions = merged_data_clean['Region'].unique()
print(regions)
# One Series of 2019 values per region.
regions_data = [
    merged_data_clean['2019'][merged_data_clean['Region'] == region]
    for region in regions
]
# Unpack instead of hard-coding seven indices — works for any region count.
statistic, pvalue = stats.f_oneway(*regions_data)
print("statistic: %s pvalue %s" %(statistic,pvalue))
# Boxplot of the 2019 distributions, one box per income group; showmeans
# marks the group mean on top of the median line.
fig = plt.figure(figsize=(20,10))
ax = fig.add_subplot(111)
ax.set_title("Boxplot of % Female Unemployment by Income Group")
# (Removed a stray `ax.set` — a bare attribute access with no effect.)
ax.boxplot(income_group_data, labels = income_groups, showmeans =True)
plt.xlabel("Country Income Group")
plt.ylabel("% Female Unemployment")
plt.show()
Analysis of variance (ANOVA) is a statistical technique that is used to check if the means of two or more groups are significantly different from each other. ANOVA checks the impact of one or more factors by comparing the means of different samples. (Source: Analytics Vidhya.) Another measure to compare the samples is the t-test.
In our case, the impact of the factor Income Group to the different means of the % Female Unemployment will be analysed.
Hypothesis Testing - Analysis of Variance (ANOVA)
The null hypothesis in ANOVA is always that there is no difference in means.
H0: mu1=mu2=mu3=mu4
The alternative hypothesis is always that the means are not all equal
H1: means are not all equal
The test statistic for testing H0: μ1 = μ2 = ... = μk is:
And follows the table of calculations:
Assumptions made while calculating the test statistic F:
Groups are the income groups: IG1, IG2, IG3, IG4
Sample Size for each group: n1 =60, n2=29, n3=48,n4=50
Sample mean: mu1, mu2, mu3, mu4
Sample standard deviation: s1,s2,s3,s4
# k = number of income groups, N = total number of observations.
k = len(merged_data_clean['IncomeGroup'].unique())
N = len(merged_data_clean)
# Between-treatments degrees of freedom.
df1 = k - 1
# Within-treatments degrees of freedom.
df2 = N - k
# Total degrees of freedom.
dfT = N - 1
print(k,N, df1, df2, dfT)
# Hoist the groupby — the original recomputed it four times — and use
# positional .iloc: plain `Series[0]` on a string-labelled index relies on
# the deprecated positional fallback (removed in pandas 3.0).
group_sizes = merged_data_clean.groupby('IncomeGroup').size()
n0 = group_sizes.iloc[0]
n1 = group_sizes.iloc[1]
n2 = group_sizes.iloc[2]
n3 = group_sizes.iloc[3]
N = n0 + n1 + n2 + n3
print(n0,n1,n2,n3,n0+n1+n2+n3)
# Grand mean of the 2019 values across all countries.
grandmu=(merged_data_clean['2019'].sum()/N)
print(grandmu)
We start by calculating the Sum of Squares between. Sum of Squares Between is the variability due to interaction between the groups. Sometimes known as the Sum of Squares of the Model.
print(n0,n1,n2,n3)
# Keep only the 2019 value and the income group, then split into one
# sub-frame per income group (IG0..IG3 are reused by later cells).
merged_data_clean_IG_2019=merged_data_clean[['2019','IncomeGroup']]
IG0=merged_data_clean_IG_2019[merged_data_clean_IG_2019['IncomeGroup']=='High income']
IG1=merged_data_clean_IG_2019[merged_data_clean_IG_2019['IncomeGroup']=='Low income']
IG2=merged_data_clean_IG_2019[merged_data_clean_IG_2019['IncomeGroup']=='Lower middle income']
IG3=merged_data_clean_IG_2019[merged_data_clean_IG_2019['IncomeGroup']=='Upper middle income']
# Group means: sum of 2019 values divided by the group size.
IG0mu_2=((sum(IG0['2019']))/n0)
IG1mu_2=((sum(IG1['2019']))/n1)
IG2mu_2=((sum(IG2['2019']))/n2)
IG3mu_2=((sum(IG3['2019']))/n3)
# Sum of Squares Between: size-weighted squared deviation of each group
# mean from the grand mean.
SSB= n0*(IG0mu_2 -grandmu )**2 + n1*(IG1mu_2-grandmu)**2 + n2*(IG2mu_2-grandmu)**2 + n3*(IG3mu_2-grandmu)**2
SSB
# Mean square between = SSB / df1  (df1 = k - 1).
MSSB= SSB/df1
MSSB
Sum of Squares Within (SSW) is the variability in the data due to differences within each group.
import statistics
# Population variance (ddof=0) of the 2019 values per income group.
# NOTE(review): these values are never used below — SSW is built from the
# sample-variance helper (ddof=1) instead, since the population variance
# gave an inaccurate SSW.
IG0_ss=statistics.pvariance(IG0['2019'])
IG1_ss=statistics.pvariance(IG1['2019'])
IG2_ss=statistics.pvariance(IG2['2019'])
IG3_ss=statistics.pvariance(IG3['2019'])
def variance(datav, ddof=1):
    """Return the variance of the values in *datav*.

    ddof=1 (the default) gives the sample variance (n - 1 denominator,
    matching statistics.variance); ddof=0 gives the population variance.
    Raises ZeroDivisionError when len(datav) <= ddof or datav is empty.
    """
    n = len(datav)
    mean = sum(datav) / n
    return sum((x - mean) ** 2 for x in datav) / (n - ddof)
# Sample variance (ddof=1) of the 2019 values within each income group.
IG0_sss=variance(IG0['2019'])
IG1_sss=variance(IG1['2019'])
IG2_sss=variance(IG2['2019'])
IG3_sss=variance(IG3['2019'])
# Sum of Squares Within: sum over groups of (n_i - 1) * s_i^2.
SSW=(n0-1)*IG0_sss +(n1-1)*IG1_sss +(n2-1)*IG2_sss +(n3-1)*IG3_sss
SSW
# Mean square within = SSW / df2  (df2 = N - k).
MSSW = SSW/df2
MSSW
# F statistic: ratio of between-group to within-group mean squares.
F=MSSB/MSSW
F
from scipy import stats
# Bug fix: under H0 the F statistic follows an F distribution with
# (df1, df2) = (k-1, N-k) degrees of freedom.  The original passed dfT (= N-1)
# as the numerator dof, which skews the p-value; scipy.stats.f_oneway uses
# (k-1, N-k) as well.
p = stats.f.sf(F, df1, df2)
p
One rejects the null hypothesis H0 if the computed F-statistic is greater than the critical F-statistic. The critical F-statistic is determined by the degrees of freedom and the alpha value. In our case (1-tailed, alpha = 0.05, dof = 186) the critical F = 2.347.
Reject H0 if the calculated F-statistic > critical F-statistic: 5.05 > 2.347
We reject the null hypothesis H0 because p<= 0.05
The % of Female Unemployment rate was measured across different income groups.
The purpose of calculating ANOVA was to see if averages of the values of % of Female Unemployment across the different Income Groups were statistically different.
We can now report that the Income Group factor greatly alters the average % Female Unemployment rate for the year 2019.
# IPython shell magic: install researchpy in the Colab runtime.
!pip install researchpy
import researchpy as rp
# Descriptive summary (N, mean, SD, SE, 95% CI) of the 2019 values —
# overall, then per income group.
rp.summary_cont(merged_data_clean['2019'])
rp.summary_cont(merged_data_clean_IG_2019.groupby(merged_data_clean_IG_2019['IncomeGroup']))
Sum of Squares Total will be needed to calculate eta-squared later. This is the total variability in the data:
# Total variability in the data: SStotal = SSB + SSW (needed for eta-squared).
SStotal = SSB+SSW
SStotal
Install the library
#One-Way ANOVA
# IPython shell magic: install pingouin in the Colab runtime.
!pip install pingouin
import pingouin as pg
# Library cross-check of the hand-computed ANOVA: 2019 values by income group.
# detailed=True also reports SS, MS and eta-squared.
aov= pg.anova(dv='2019', between='IncomeGroup',data= merged_data_clean,detailed=True)
aov
Next, we analyse the impact of the factor Region on the different means of the % Female Unemployment.
# One-way ANOVA of the 2019 values across regions.
aov= pg.anova(dv='2019', between='Region',data= merged_data_clean,detailed=True)
aov
What follows next is unpivoting main_data from wide to long format with the melt function, optionally keeping identifier columns. One column holds all the identifiers; later on we will select only one indicator (% Female Unemployment) and only one year (2019) of data from the 'Year' column.
# Wide -> long reshape: one row per (country, indicator, year).  The 30 year
# labels are generated instead of hard-coded — identical value_vars list.
main_data_m = main_data.melt(
    id_vars=['Country Code', 'Indicator Name'],
    value_vars=[str(year) for year in range(1991, 2021)],
)
main_data_m.head()
main_data_m= main_data_m.rename(columns={'variable': 'Year'})
main_data_m.head()
main_data_m_clean=main_data_m.dropna()
main_data_m_clean.head()
import pandas as pd
import folium
import csv
import json
# Working reference to the cleaned long-format data.
stage = main_data_m_clean
stage
# Keep only 2019 rows for the female-unemployment indicator.
main_data_m_clean_year=main_data_m_clean[main_data_m_clean['Year']=='2019']
main_data_m_clean_year_ind=main_data_m_clean_year[main_data_m_clean_year['Indicator Name']=='Unemployment, female (% of female labor force) (modeled ILO estimate)']
main_data_m_clean_year_ind
# Two-column frame for the choropleth: ISO-3 code -> 2019 value.
data_to_plot = main_data_m_clean_year_ind[['Country Code','value']]
data_to_plot['Country Code'].unique()
data_to_plot
# Human-readable indicator name, used as the map legend/title text.
hist_indicator = main_data_m_clean_year_ind.iloc[0]['Indicator Name']
hist_indicator
# IPython shell magic: download the world-countries GeoJSON boundary file.
!wget --quiet https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DV0101EN/labs/Data_Files/world_countries.json
print('GeoJSON file downloaded!')
# Local path to the downloaded GeoJSON.
wc=r'world_countries.json'
# Base map centred on (0, 0) at world zoom.
world1 = folium.Map(location=[0, 0], zoom_start=2)
# folium.Map.choropleth() was deprecated and later removed; the supported API
# is the folium.Choropleth class added to the map.  Arguments are unchanged:
# GeoJSON 'feature.id' is matched against the 'Country Code' column.
folium.Choropleth(geo_data =wc ,
                  data = data_to_plot,
                  columns = ['Country Code', 'value'],
                  key_on='feature.id',
                  fill_color = 'YlOrRd',
                  fill_opacity =0.8 ,
                  line_opacity = 0.1,
                  legend_name ='%Female Unemployment').add_to(world1)
world1
The darker the color, the higher the % Unemployment.
# Same choropleth on a watercolor tile set.
world = folium.Map(location=[0, 0], zoom_start=2, tiles = 'stamenwatercolor')
# Modernized: folium.Choropleth class instead of the removed Map.choropleth().
folium.Choropleth(geo_data =wc ,
                  data = data_to_plot,
                  columns = ['Country Code', 'value'],
                  key_on='feature.id',
                  fill_color = 'YlOrRd',
                  fill_opacity =0.8 ,
                  line_opacity = 0.1,
                  legend_name ='%Female Unemployment').add_to(world)
world
A tile-layer control has been added at the top right of the map.
worlds = folium.Map(location=[0, 0], zoom_start=2, tiles = 'cartodbpositron')
# Modernized: folium.Choropleth class instead of the removed Map.choropleth().
folium.Choropleth(geo_data =wc ,
                  data = data_to_plot,
                  columns = ['Country Code', 'value'],
                  key_on='feature.id',
                  fill_color = 'YlOrRd',
                  fill_opacity =0.8 ,
                  line_opacity = 0.1,
                  legend_name ='%Female Unemployment').add_to(worlds)
# Alternative base maps selectable via the LayerControl.
folium.TileLayer('Stamen Terrain').add_to(worlds)
folium.TileLayer('Stamen Toner').add_to(worlds)
folium.TileLayer('Stamen Water Color').add_to(worlds)
folium.TileLayer('cartodbpositron').add_to(worlds)
folium.TileLayer('cartodbdark_matter').add_to(worlds)
folium.LayerControl().add_to(worlds)
worlds
# Blank base map (tiles=None), then a single light tile layer pinned outside
# the layer control.
worldss = folium.Map(location = [0,0], zoom_start=2,tiles=None)
folium.TileLayer('CartoDB positron',name="Light Map",control=False).add_to(worldss)
Change the class thresholds of the color bar.
# Quantile-based class breaks for the color scale.
myscale = (data_to_plot['value'].quantile((0,0.1,0.75,0.9,0.98,1))).tolist()
# Modernized: folium.Choropleth class instead of the removed Map.choropleth().
folium.Choropleth(
    geo_data=wc,
    name='Choropleth',
    data=data_to_plot,
    columns=['Country Code','value'],
    key_on="feature.id",
    fill_color='YlGnBu',
    threshold_scale=myscale,
    fill_opacity=1,
    line_opacity=0.2,
    legend_name='% Female Unemployment ',
    smooth_factor=0
).add_to(worldss)
worldss
# IPython shell magic: install geopandas in the Colab runtime.
!pip install geopandas
import geopandas as gpd
import folium
import branca.colormap as cm
wc=r'world_countries.json'
# NOTE(review): data_url is assigned twice and never read afterwards.
data_url = r'countries.geojson'
data_url = 'https://datahub.io/core/geo-countries/datapackage.json'
fname =r'world_countries.json'
# GeoDataFrame: one polygon/multipolygon per country plus 'id' and 'name'.
worldc = gpd.read_file(fname)
worldc.columns
worldc
worldc_data=worldc[['id', 'name','geometry']]
worldc_data.head()
# Rename the GeoJSON 'id' column to match the World Bank 'Country Code' key.
worldc_datar = worldc_data.rename(columns = {'id': 'Country Code'}, inplace = False)
worldc_datar
data_to_plot
# Join country geometry with the 2019 unemployment values on the ISO-3 code.
world_datad=pd.merge(worldc_datar, data_to_plot, on='Country Code')
# NOTE(review): world_datadc and world_datado select identical columns —
# only world_datado is used below.
world_datadc=world_datad[['name','Country Code','geometry','value']]
world_datadc
world_datado=world_datad[['name','Country Code','geometry', 'value' ]]
world_datado.head()
world_datado['Country Code'].unique().shape
world_datado.isna().shape
world_datadoo=world_datado.dropna()
world_datadoo['Country Code'].unique().shape
# Quantile-based class breaks over the merged values.
myscale = (world_datado['value'].quantile((0,0.1,0.75,0.9,0.98,1))).tolist()
worldss = folium.Map(location=[5,5], zoom_start=2,tiles=None, control_scale=True)
folium.TileLayer('CartoDB positron',name="Light Map",control=False).add_to(worldss)
# Boundary layer: transparent fill with thin black country outlines.
Boundaries = folium.GeoJson(
    worldc,
    style_function = lambda x: {
        'color': 'black',
        'weight': 1,
        "opacity":1,
        'fillOpacity': 0,
    }).add_to(worldss)
# Stepped YlGnBu colormap matching the quantile breaks above.
colormap = cm.linear.YlGnBu_03.to_step(data=world_datado['value'],
                                       method='quant', quantiles=[0,0.1,0.75,0.9,0.98,1])
colormap
#worldss = folium.Map(location=[5,5], zoom_start=2,tiles=None)
folium.TileLayer('CartoDB positron',name="Light Map",control=False).add_to(worldss)
colormap.caption = "% Female Unemployment"
# Fill each country from the stepped colormap.
# NOTE(review): fillOpacity 10 exceeds the valid 0-1 range (browsers clamp it to 1).
style_function = lambda x: {'fillColor': colormap(x['properties']['value']),
                            'color':'#000000',
                            'fillOpacity': 10,
                            'weight': 2}
# Solid black highlight while hovering.
highlight_function = lambda x: {'fillColor': '#000000',
                                'color':'#000000',
                                'fillOpacity': 1,
                                'weight': 1}
NIL = folium.features.GeoJson(
    world_datado,
    style_function=style_function,
    control=False,
    highlight_function=highlight_function,
    # Hover tooltip: country name and its 2019 value.
    tooltip=folium.features.GeoJsonTooltip(
        fields=['name','value'],
        aliases=['Country Name: ','% Female Unemployment'],
        localize = True,
        style=("background-color: white; color: #333333; font-family: arial; font-size: 12px; padding: 10px;")
    )
)
worldss.add_child(NIL)
# NOTE(review): keep_in_front is accessed but not called — this line is a no-op.
worldss.keep_in_front
#folium.LayerControl().add_to(worldss)
worldss
folium.TileLayer('CartoDB positron',name="Light Map",control=False).add_to(worldss)
colormap.caption = "% Female Unemployment"
# Fill each country from the stepped colormap; black outlines.
style_function = lambda x: {'fillColor': colormap(x['properties']['value']),
                            'color':'#000000',
                            'fillOpacity': 2,  # NOTE(review): >1 is clamped to 1 by the browser
                            'weight': 1}
# Solid black highlight while hovering.
highlight_function = lambda x: {'fillColor': '#000000',
                                'color':'#000000',
                                'fillOpacity': 1,
                                'weight': 1}
NIL = folium.features.GeoJson(
    world_datado,
    style_function=style_function,
    control=False,
    highlight_function=highlight_function,
    # Hover tooltip: country name and its 2019 value.
    tooltip=folium.features.GeoJsonTooltip(
        fields=['name','value'],
        aliases=['Country Name: ','% Female Unemployment'],
        localize = True,
        style=("background-color: white; color: #333333; font-family: arial; font-size: 12px; padding: 10px;")
    )
)
worldss.add_child(NIL)
worldss.keep_in_front
# Bug fix: the alternative base-map tile layers were being added to `worlds`
# (the earlier map) while the LayerControl below is attached to `worldss`,
# leaving the control with nothing to switch between.  Attach them to `worldss`.
folium.TileLayer('Stamen Terrain').add_to(worldss)
folium.TileLayer('Stamen Toner').add_to(worldss)
folium.TileLayer('Stamen Water Color').add_to(worldss)
folium.TileLayer('cartodbpositron').add_to(worldss)
folium.TileLayer('cartodbdark_matter').add_to(worldss)
folium.LayerControl().add_to(worldss)
worldss